Load the Raw Dataset: Use Pyspark to the lightcast_data.csv file into a DataFrame: You can reuse the previous code. Copying code from your friend constitutes plagiarism. DO NOT DO THIS.
import pandas as pdimport plotly.express as pximport plotly.io as piofrom pyspark.sql import SparkSessionimport reimport numpy as npimport plotly.graph_objects as gofrom pyspark.sql.functions import col, split, explode, regexp_replace, transform, whenfrom pyspark.sql import functions as Ffrom pyspark.sql.functions import col, monotonically_increasing_idnp.random.seed(42)pio.renderers.default ="notebook"# Initialize Spark Sessionspark = SparkSession.builder.appName("LightcastData").getOrCreate()# Load Datadf = spark.read.option("header", "true").option("inferSchema", "true").option("multiLine","true").option("escape", "\"").csv("/home/ubuntu/assignment-03-t-primero/data/lightcast_job_postings.csv")df.createOrReplaceTempView("job_postings")# Show Schema and Sample Data# print("---This is Diagnostic check, No need to print it in the final doc---")# df.printSchema() # comment this line when rendering the submission# df.show(5)
WARNING: Using incubator modules: jdk.incubator.vector
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/24 03:08:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/09/24 03:08:18 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
[Stage 0:> (0 + 1) / 1] [Stage 1:> (0 + 1) / 1] 25/09/24 03:08:31 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
2 Data Preparation
We will be converting numerical columns to floats - this is so we can perform functions on it such as average.
3 Salary Distribution by Industry and Employment Type
Compare salary variations across industries. Filter the dataset Remove records where salary is missing or zero. Aggregate Data Group by NAICS industry codes. Group by employment type and compute salary distribution. Visualize results Create a box plot where: X-axis = ‘NAICS2_NAME’ Y-axis = ‘SALARY_FROM’, or ‘SALARY_TO’, or ‘SALARY’ Group by ‘EMPLOYMENT_TYPE_NAME’. Customize colors, fonts, and styles. Explanation: Write two sentences about what the graph reveals.
# Your Code for 1st question hereimport pandas as pdimport polars as pl# Filter out missing or zero salary valuespdf = df.filter(df["SALARY"] >0).select("EMPLOYMENT_TYPE_NAME", "SALARY").toPandas()# pdf.head()# Clean employment type names for better readability# This Basically looks for symbols numbers (which were incorrectly added into data name)pdf["EMPLOYMENT_TYPE_NAME"] = pdf["EMPLOYMENT_TYPE_NAME"].apply(lambda x: re.sub(r"[^\x00-\x7f]+", "", x))# pdf.head()# Compute median salary for sortingmedian_salaries = pdf.groupby("EMPLOYMENT_TYPE_NAME")["SALARY"].median()# median_salaries.head()# Sort employment types based on median salary in descending ordersorted_employment_types = median_salaries.sort_values(ascending=False).index# Apply sorted categoriespdf["EMPLOYMENT_TYPE_NAME"] = pd.Categorical( pdf["EMPLOYMENT_TYPE_NAME"], categories=sorted_employment_types, ordered=True)# Create box plot with horizontal grid linesfig = px.box( pdf, x="EMPLOYMENT_TYPE_NAME", y="SALARY", title="Salary Distribution by Employment Type", color_discrete_sequence=["orange"], # Single neutral color boxmode="group", points="all", # Show all outliers)# Improve layout, font styles, and axis labelsfig.update_layout( title=dict( text="Salary Distribution by Employment Type", font=dict(size=26, family="Verdana", color="black", weight="bold") # Bigger & Bold Title ), xaxis=dict( title=dict(text="Employment Type", font=dict(size=22, family="Verdana", color="black", weight="bold")), # Bigger X-label tickangle=0, # Rotate X-axis labels for readability tickfont=dict(size=18, family="Verdana", color="black", weight="bold"), # Bigger & Bold X-ticks showline=True, # Show axis lines linewidth=2, # Thicker axis lines linecolor="black", mirror=True, showgrid=False, # Remove vertical grid lines categoryorder="array", categoryarray=sorted_employment_types.tolist() ), yaxis=dict( title=dict(text="Salary (Thousands)", font=dict(size=22, family="Verdana", color="black", weight="bold")), # Bigger Y-label tickvals=[0, 50000, 100000, 150000, 200000, 250000, 300000, 350000, 400000, 450000, 500000], ticktext=["0", "50", "100", "150", "200", "250", "300", "350", "400", "450", "500"], tickfont=dict(size=18, family="Verdana", color="black", weight="bold"), # Bigger & Bold Y-ticks showline=True, linewidth=2, linecolor="black", mirror=True, showgrid=True, # Enable light horizontal grid lines gridcolor="lightgray", # Light shade for the horizontal grid gridwidth=0.5# Thin grid lines ), font=dict(family="Verdana", size=16, color="black"), boxgap=0.5, plot_bgcolor="white", paper_bgcolor="white", showlegend=False, height=800, width=900)# Show the figurefig.show()
[Stage 6:> (0 + 1) / 1]
4 Salary Distribution by Industry
pdf = df.select("NAICS2_NAME", "SALARY").toPandas()fig = px.box(pdf, x="NAICS2_NAME", y="SALARY", title="Salary Distribution by Industry", color_discrete_sequence=["#EF553B"])fig.update_layout(font_family="Arial", title_font_size=16, height=1000, width=1200)# Rotate x-axis labels for readabilityfig.update_xaxes(tickangle=45, tickfont=dict(size=12))fig.show()
[Stage 7:> (0 + 1) / 1]
5 Salary Analysis by ONET Occupation Type (Bubble Chart)
Analyze how salaries differ across ONET occupation types. Aggregate Data Compute median salary for each occupation in the ONET taxonomy. Visualize results Create a bubble chart where: X-axis = ONET_NAME Y-axis = Median Salary Size = Number of job postings Apply custom colors and font styles. Explanation: Write two sentences about what the graph reveals.
[Stage 8:> (0 + 1) / 1]
# Defining education level groupingslower_deg = ["Bachelor's", "Associate", "GED", "No Education Listed", "High school"]higher_deg = ["Master's degree", "PhD or professional degree"]# Adding EDU_GROUP columndf = df.withColumn("EDU_GROUP", when(col("EDUCATION_LEVELS_NAME").rlike("|".join([f"(?i){deg}"for deg in lower_deg])), "Bachelor's or lower") .when(col("EDUCATION_LEVELS_NAME").rlike("|".join([f"(?i){deg}"for deg in higher_deg])), "Master's or PhD") .otherwise("Other"))# Casting necessary columns to floatdf = df.withColumn("MAX_YEARS_EXPERIENCE", col("MAX_YEARS_EXPERIENCE").cast("float"))df = df.withColumn("Average_Salary", col("Average_Salary").cast("float"))# Filtering for non-null and positive valuesdf = df.filter( col("MAX_YEARS_EXPERIENCE").isNotNull() & col("Average_Salary").isNotNull() & (col("MAX_YEARS_EXPERIENCE") >0) & (col("Average_Salary") >0))# Filtering for just the two education groupsdf_filtered = df.filter(col("EDU_GROUP").isin("Bachelor's or lower", "Master's or PhD"))# Converting to Pandas for plottingdf_pd = df_filtered.toPandas()
Split into three groups based on ‘REMOTE_TYPE_NAME’: Remote Hybrid Onsite (includes [None] and blank) Plot scatter plots for each group using, ‘MAX_YEARS_EXPERIENCE’ (with jitter), ‘Average_Salary’, ‘LOT_V6_SPECIALIZED_OCCUPATION_NAME’ Also, create salary histograms for all three groups. After each graph, briefly describe any patterns or comparisons.
# Step 1: Create the Average_Salary column using SQLremote_salary_data = spark.sql("""SELECT MAX_YEARS_EXPERIENCE, (SALARY_FROM + SALARY_TO) / 2 AS Average_Salary, LOT_V6_SPECIALIZED_OCCUPATION_NAME, CASE WHEN REMOTE_TYPE_NAME = 'Remote' THEN 'Remote' WHEN REMOTE_TYPE_NAME = 'Hybrid' THEN 'Hybrid' ELSE 'Onsite' END AS REMOTE_GROUPFROM job_postingsWHERE SALARY_FROM IS NOT NULL AND SALARY_TO IS NOT NULL AND SALARY_FROM > 0 AND SALARY_TO > 0 AND MAX_YEARS_EXPERIENCE IS NOT NULL""")# Step 2: Convert to pandas for plottingdf_viz = remote_salary_data.toPandas()# Step 3: Create scatter plot (rest of code same as before)import plotly.express as pximport numpy as npnp.random.seed(42)df_viz['EXPERIENCE_JITTER'] = df_viz['MAX_YEARS_EXPERIENCE'] + np.random.uniform(-0.3, 0.3, len(df_viz))fig_scatter = px.scatter( df_viz, x='EXPERIENCE_JITTER', y='Average_Salary', color='REMOTE_GROUP', hover_data=['LOT_V6_SPECIALIZED_OCCUPATION_NAME'], title='Salary by Experience and Remote Work Type')fig_scatter.show()